week 5: integers and other monsters

counts

populations and tools

Code
library(ggdag)

# Node layout for the tools DAG: C = contact, P = population,
# L = location (shared, unobserved influence), T = total tools.
dag_coords <-
  tibble(name = c("C", "P", "L", "T"),
         x    = c(1, 1, 2, 2),
         y    = c(2, 1, 2, 1))

dagify(C ~ L + P,
       P ~ L,
       T ~ C + L + P,
       coords = dag_coords) %>%
  
  ggplot(aes(x = x, y = y, xend = xend, yend = yend)) +
  # Highlight the outcome node T. (The original tested `name == "D"`, a
  # leftover from another example — no node "D" exists in this DAG, so all
  # nodes silently got the same color.)
  geom_dag_point(aes(color = name == "T"),
                 alpha = 1/2, size = 6.5, show.legend = FALSE) +
  # Open orange circle marking T as the outcome of interest.
  geom_point(x = 2, y = 1, 
             size = 6.5, shape = 1, stroke = 1, color = "orange") +
  geom_dag_text(color = "black") +
  geom_dag_edges() +
  scale_color_manual(values = c("#1c5253", "orange")) +
  scale_x_continuous(NULL, breaks = NULL, expand = c(.1, .1)) +
  scale_y_continuous(NULL, breaks = NULL, expand = c(.1, .1))

counts are Poisson distributed

\[\begin{align*} Y_i &\sim \text{Poisson}(\lambda_i) \\ \text{log}(\lambda_i) &= \alpha + \beta x_i \\ \lambda_i &= \text{exp}(\alpha + \beta x_i) \end{align*}\]

You’ll need to be careful about your priors.

\[\begin{align*} \text{log}(\lambda_i) &= \alpha \\ \alpha &\sim \text{Normal}(0,10) \end{align*}\]

Simulate this:

Code
# Prior predictive simulation for alpha ~ Normal(0, 10) on the log scale.
# set.seed() makes the draws reproducible across renders; `<-` replaces
# `=` for assignment per R style conventions.
set.seed(11)
n_sim <- 1e4
alpha_prior <- rnorm(n_sim, 0, 10)
# Inverse link: exp() maps the log-scale prior to the Poisson rate.
lambda_prior <- exp(alpha_prior)
count_prior <- rpois(n_sim, lambda_prior)
# A Normal(0, 10) prior on log(lambda) implies absurdly large counts.
range(count_prior)
[1] 0.00000e+00 1.41739e+16
Code
# Average prior-implied count — astronomically large under Normal(0, 10),
# confirming this prior is far too wide for a log-link model.
mean(count_prior)
[1] 2.720336e+12
Code
# Density of the prior-predictive counts; x-axis truncated to [0, 100]
# for display (the tail extends far beyond).
ggplot(data.frame(count = count_prior), aes(x = count)) +
  geom_density() +
  scale_x_continuous(limits = c(0, 100))

\[\begin{align*} \text{log}(\lambda_i) &= \alpha \\ \alpha &\sim \text{Normal}(3,.5) \end{align*}\]

Simulate this:

Code
# Prior predictive simulation for alpha ~ Normal(3, 0.5): a far more
# sensible prior on the log scale. set.seed() makes the draws reproducible;
# `<-` replaces `=` for assignment per R style conventions.
set.seed(11)
n_sim <- 1e4
alpha_prior <- rnorm(n_sim, 3, .5)
# Inverse link: exp() maps the log-scale prior to the Poisson rate.
lambda_prior <- exp(alpha_prior)
count_prior <- rpois(n_sim, lambda_prior)
# Counts now fall in a plausible range for tool counts.
range(count_prior)
[1]   1 122
Code
# Average prior-implied count — roughly exp(3 + 0.5^2/2) ~ 23 tools,
# a sensible scale for this problem.
mean(count_prior)
[1] 22.7839
Code
# Density of the prior-predictive counts under the Normal(3, 0.5) prior;
# same truncated x-axis as the previous plot for comparison.
ggplot(data.frame(count = count_prior), aes(x = count)) +
  geom_density() +
  scale_x_continuous(limits = c(0, 100))

example

# Load the Kline ocean-tools data (10 island societies) from rethinking.
data(Kline, package = "rethinking")
# Self-assignment forces the lazy-data promise so Kline materializes as a
# regular data frame in the global environment.
Kline <- Kline
Kline
      culture population contact total_tools mean_TU
1    Malekula       1100     low          13     3.2
2     Tikopia       1500     low          22     4.7
3  Santa Cruz       3600     low          24     4.0
4         Yap       4791    high          43     5.0
5    Lau Fiji       7400    high          33     5.0
6   Trobriand       8000    high          19     4.0
7       Chuuk       9200    high          40     3.8
8       Manus      13000     low          28     6.6
9       Tonga      17500    high          55     5.4
10     Hawaii     275000     low          71     6.6
# Derived predictors: P = log population; contact_id indexes contact level
# (2 = high, 1 = low). `<-` replaces `=` for assignment per R style.
Kline <- Kline %>% 
  mutate(
    P = log(population),
    contact_id = ifelse(contact == "high", 2, 1)
  )

Intercept only model

# Intercept-only Poisson model of total tool counts.
# The Normal(3, 0.5) prior on the log-scale intercept keeps the implied
# mean count in a plausible range (see the prior simulation above).
m1 <- brm(
  data = Kline, 
  family = poisson,
  total_tools ~ 1, 
  prior = c( prior( normal(3, 0.5), class = Intercept) ),
  iter = 2000, warmup = 1000, chains = 4, cores = 4, 
  seed = 11,
  # cache the fitted model to disk so re-renders skip sampling
  file = here("files/data/generated_data/m52.1")
)
# NOTE(review): this repeats the mutate() block above verbatim. It is
# idempotent (re-creates the same columns) and could be removed.
Kline = Kline %>% 
  mutate(
    P = log(population),
    contact_id = ifelse( contact == "high", 2, 1)
  )

Interaction model

# Interaction model in brms non-linear syntax: a separate intercept (a)
# and log-population slope (b) for each contact level, i.e. a full
# contact-by-population interaction on the log scale.
m2 <- brm(
  data = Kline, 
  family = poisson,
  bf(total_tools ~ a + b*P, 
     a + b ~ 0 + contact,
     nl = TRUE), 
  prior = c( prior( normal(3, 0.5), nlpar = a),
             prior( normal(0, 0.2), nlpar = b)),
  iter = 2000, warmup = 1000, chains = 4, cores = 4, 
  seed = 11,
  # cache the fitted model to disk so re-renders skip sampling
  file = here("files/data/generated_data/m52.2")
)
# Posterior summary of the intercept-only model.
m1
 Family: poisson 
  Links: mu = log 
Formula: total_tools ~ 1 
   Data: Kline (Number of observations: 10) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
          Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
Intercept     3.54      0.05     3.44     3.64 1.00     1716     1970

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).
# Posterior summary of the interaction model (contact-specific a and b).
m2
 Family: poisson 
  Links: mu = log 
Formula: total_tools ~ a + b * P 
         a ~ 0 + contact
         b ~ 0 + contact
   Data: Kline (Number of observations: 10) 
  Draws: 4 chains, each with iter = 2000; warmup = 1000; thin = 1;
         total post-warmup draws = 4000

Regression Coefficients:
              Estimate Est.Error l-95% CI u-95% CI Rhat Bulk_ESS Tail_ESS
a_contacthigh     2.84      0.47     1.92     3.74 1.00     1378     1405
a_contactlow      1.69      0.29     1.13     2.25 1.00     1439     1323
b_contacthigh     0.09      0.05    -0.01     0.19 1.00     1399     1489
b_contactlow      0.19      0.03     0.14     0.25 1.00     1444     1295

Draws were sampled using sampling(NUTS). For each parameter, Bulk_ESS
and Tail_ESS are effective sample size measures, and Rhat is the potential
scale reduction factor on split chains (at convergence, Rhat = 1).

What do these mean?

Once we’ve moved outside of the Gaussian distribution, your best bet is to push everything back through the posterior. Do NOT try to interpret the raw coefficient estimates directly: they live on the log (link) scale, not the outcome scale.

# For the intercept-only model the predictor value is irrelevant; nd just
# needs one row so epred_draws() returns draws of the expected count.
nd <- data.frame(P = 1) # intercept only model

# Median and 95% quantile interval of the posterior expected tool count.
epred_draws(object = m1, newdata = nd) %>% 
  median_qi
# A tibble: 1 × 8
      P  .row .epred .lower .upper .width .point .interval
  <dbl> <int>  <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
1     1     1   34.6   31.1   38.2   0.95 median qi       
# Prediction grid: each contact level crossed with 100 log-population
# values spanning the observed range.
nd <-
  Kline %>% 
  distinct(contact) %>% 
  expand_grid(P = seq(from = min(Kline$P),
                      to = max(Kline$P),
                      length.out = 100))

# Median and 95% interval of the posterior expected count for every
# contact level / log-population combination.
f2 <-
  epred_draws(object = m2, newdata = nd) %>% 
  group_by(contact, P) %>% 
  median_qi()

f2
# A tibble: 200 × 8
   contact     P .epred .lower .upper .width .point .interval
   <fct>   <dbl>  <dbl>  <dbl>  <dbl>  <dbl> <chr>  <chr>    
 1 high     7.00   31.7   24.9   40.2   0.95 median qi       
 2 high     7.06   31.9   25.1   40.3   0.95 median qi       
 3 high     7.11   32.0   25.4   40.3   0.95 median qi       
 4 high     7.17   32.2   25.6   40.3   0.95 median qi       
 5 high     7.23   32.4   25.8   40.3   0.95 median qi       
 6 high     7.28   32.5   26.1   40.4   0.95 median qi       
 7 high     7.34   32.7   26.3   40.4   0.95 median qi       
 8 high     7.39   32.8   26.6   40.4   0.95 median qi       
 9 high     7.45   33.0   26.9   40.5   0.95 median qi       
10 high     7.51   33.2   27.1   40.5   0.95 median qi       
# ℹ 190 more rows
# Posterior-mean trend with 95% ribbon, one curve per contact level, with
# the observed societies overlaid.
# geom_line() draws the posterior medians directly; the original used
# geom_smooth(), which fits an extra loess smoother (with its own
# confidence band) on top of the already-summarized posterior medians.
f2 %>% 
  ggplot(aes(x = P)) + 
  geom_ribbon(aes(ymin = .lower, ymax = .upper, fill = contact), 
              alpha = .3) +
  geom_line(aes(y = .epred, color = contact)) +
  geom_point(data = Kline, aes(y = total_tools, color = contact)) +
  labs(x = "log population", y = "number of tools") 

compare

# Attach PSIS-LOO criteria to each fitted model (stored inside the brmsfit).
m1  <- add_criterion(m1, "loo")
m2 <- add_criterion(m2, "loo")

# elpd_diff is relative to the best model (top row); se_diff is its SE.
loo_compare(m1, m2, criterion = "loo") %>% print(simplify = F)
   elpd_diff se_diff elpd_loo se_elpd_loo p_loo se_p_loo looic se_looic
m2   0.0       0.0   -45.7      7.2         8.2   3.4     91.3  14.3   
m1 -24.9      14.0   -70.6     16.9         7.9   3.5    141.2  33.7   
# Tabulate Pareto-k diagnostics; k > 1 flags observations for which the
# PSIS-LOO approximation is unreliable (here, one highly influential point).
loo(m2) %>% loo::pareto_k_table()
Pareto k diagnostic values:
                         Count Pct.    Min. ESS
(-Inf, 0.7]   (good)     9     90.0%   232     
   (0.7, 1]   (bad)      0      0.0%   <NA>    
   (1, Inf)   (very bad) 1     10.0%   <NA>    
# Pointwise PSIS-LOO results for m2 with each society's Pareto k,
# sorted so the most influential observations come first.
m2k <- m2$criteria$loo$pointwise %>% 
  as.data.frame() %>% 
  # pointwise rows keep the original data order, so culture labels align
  mutate(culture = Kline$culture) %>% 
  arrange(desc(influence_pareto_k)) %>% 
  # across(where(...)) replaces the superseded mutate_if()
  mutate(across(where(is.double), ~ round(.x, digits = 2)))
m2k
   elpd_loo mcse_elpd_loo p_loo looic influence_pareto_k    culture
1     -7.12          0.49  3.27 14.24               1.27     Hawaii
2     -6.44          0.06  1.60 12.87               0.56      Tonga
3     -9.23          0.05  1.82 18.45               0.43  Trobriand
4     -3.28          0.01  0.20  6.56               0.29      Manus
5     -4.51          0.04  0.77  9.02               0.29   Malekula
6     -3.71          0.02  0.30  7.42               0.26        Yap
7     -3.12          0.01  0.12  6.23               0.23   Lau Fiji
8     -2.74          0.01  0.06  5.47               0.17 Santa Cruz
9     -2.93          0.01  0.04  5.86               0.16      Chuuk
10    -2.59          0.00  0.03  5.19               0.13    Tikopia

Adding these to our plot:

Code
# Join the LOO diagnostics back to the raw data so point size can encode
# Pareto k. "culture" is the only shared column; naming it in `by`
# silences the implicit-join message without changing the result.
m2k <- m2k %>% full_join(Kline, by = "culture")
library(ggrepel)
f2 %>% 
  ggplot(aes(x = P)) + 
  geom_ribbon(aes(ymin = .lower, ymax = .upper, fill = contact), 
              alpha = .3) +
  # Draw posterior medians directly; geom_smooth() would overlay an extra
  # loess fit with its own confidence band on the summarized medians.
  geom_line(aes(y = .epred, color = contact)) +
  geom_point(data = m2k, 
             aes(y = total_tools, 
                 size = influence_pareto_k,
                 color = contact)) +
  geom_text_repel(data = m2k, 
             aes(y = total_tools,
                 label = culture)) +
  guides(size = "none") +
  labs(x = "log population", y = "number of tools") +
  theme(legend.position = "top")

Natural scale

Code
# Same plot with the x-axis exponentiated back to raw population.
# Naming `by` silences the implicit-join message without changing results.
m2k <- m2k %>% full_join(Kline, by = "culture")
library(ggrepel)
f2 %>% 
  ggplot(aes(x = P)) + 
  geom_ribbon(aes(ymin = .lower, ymax = .upper, fill = contact), 
              alpha = .3) +
  # Draw posterior medians directly; geom_smooth() would overlay an extra
  # loess fit with its own confidence band on the summarized medians.
  geom_line(aes(y = .epred, color = contact)) +
  geom_point(data = m2k, 
             aes(y = total_tools, 
                 size = influence_pareto_k,
                 color = contact)) +
  geom_text_repel(data = m2k, 
             aes(y = total_tools,
                 label = culture)) +
  # Breaks are placed on the log scale; log(0) = -Inf is not a valid
  # break (the original's 0 could never be drawn), so start at 10,000.
  scale_x_continuous(trans = "exp",
                     breaks = log(c(10000, 50000, 150000, 250000)),
                     labels = c(10000, 50000, 150000, 250000)) +
  guides(size = "none") +
  labs(x = "population", y = "number of tools") +
  theme(legend.position = "top")

exercise

The data in data(Primates301) are 301 primate species and associated measures. Model the number of observations of social_learning for each species as a function of the log brain size. Use a Poisson distribution for the social_learning outcome variable. Interpret the resulting posterior.

solution

# Load the 301-species primate data and add log brain size as predictor B.
data(Primates301, package = "rethinking")
d <- Primates301
# NOTE(review): brain contains missing values, so B inherits NAs —
# confirm how brm() handles the incomplete rows when fitting below.
d$B <- log(d$brain)

# Poisson regression of social-learning observations on log brain size.
# The narrow Normal(0, .1) slope prior reflects that, on the log scale,
# even small slopes imply large multiplicative effects.
m4 <- brm(
  data = d, 
  family = poisson,
  social_learning ~ 1 + B,
  prior = c( prior( normal(3,.5), class = Intercept),
             prior( normal(0, .1), class = b)),
  iter = 2000, warmup = 1000, chains = 4, cores = 4, 
  seed = 11,
  # cache the fitted model to disk so re-renders skip sampling
  file = here("files/data/generated_data/m52.4")
)
Code
# Prediction grid over the observed range of log brain size.
nd <- expand.grid(
  B = seq(min(d$B, na.rm=T), max(d$B, na.rm=T), length = 100)
)

# Expected social-learning counts summarised (median + 95% interval)
# within each value of B via with_groups().
f4 <- epred_draws(m4, nd) %>% 
  with_groups(B, median_qi, .epred)

# Posterior mean curve with 95% ribbon over the raw species counts.
ggplot(f4, aes(x = B, y = .epred)) +
  geom_ribbon(aes(ymin = .lower, ymax =.upper), alpha = .3) +
  geom_line() +
  geom_point(data = d, 
             aes(y = social_learning), 
             alpha = .3, 
             color = "#1c5253") +
  labs(x = "log brain size",
       y = "social learning")
Code
# Same fit, with the x-axis exponentiated back to the natural
# brain-size scale.
f4 %>% 
  ggplot(aes(x = B, y = .epred)) +
  geom_ribbon(aes(ymin = .lower, ymax = .upper), alpha = .3) +
  geom_line() +
  geom_point(data = d,
             aes(y = social_learning),
             alpha = .3,
             color = "#1c5253") +
  scale_x_continuous(trans = "exp") +
  labs(x = "brain size",
       y = "social learning")

Returning to the tools example, McElreath points to a theoretical issue with the plots. He proposes to solve this with a better representation of the relationship between tools and time.

\[ \Delta T = \alpha P^{\beta}-\gamma T \]

  • \(\Delta\) = change
  • \(\alpha\) = innovation rate
  • \(\beta\) = diminishing returns (elasticity)
  • \(\gamma\) = rate of loss

In this case, both \(\alpha\) and \(\beta\) are moderated by contact.

If you solve for T:

\[ \hat{T} = \frac{\alpha_CP^{\beta_C}}{\gamma} \]

Note that in this case we’re not using the log link function. Wild! This is because all our parameters must be positive. There are two ways to do this: the first is to use priors (with lower bounds) that constrain the parameters to be positive. This is nice for transparency, but computationally more difficult. The other option is to wrap a parameter in the exponential function (as done with `exp(a)` in the formula below). This code does both.

# "Scientific" model: T-hat = exp(a) * P^b / g with an identity link, so
# the predictor itself must stay positive. exp(a) guarantees a positive
# innovation term; lb = 0 on b and g constrains them through their priors.
# a and b vary by contact level; the loss rate g is shared.
# NOTE(review): P here is log(population) (created earlier), while the
# derivation's P is raw population — confirm which scale is intended.
m3 <-
  brm(data = Kline, 
      family = poisson(link = "identity"),
      bf(total_tools ~ exp(a) * P^b / g,
         a + b ~ 0 + contact,
         g ~ 1,
         nl = TRUE),
      prior = c(prior(normal(1, 1), nlpar = a),
                prior(exponential(1), nlpar = b, lb = 0),
                prior(exponential(1), nlpar = g, lb = 0)),
      iter = 2000, warmup = 1000, chains = 4, cores = 4,
      seed = 11,
      # raise adapt_delta to tame divergences in this harder posterior
      control = list(adapt_delta = .85),
      file = here("files/data/generated_data/m52.3"))

exercise

Returning to the primate example: Some species are studied much more than others. So the number of reported instances of social_learning could be a product of research effort. Use the research_effort variable, specifically its logarithm, as an additional predictor variable. Interpret the coefficient for log research_effort. How does this model differ from the previous one?